/***************************************************************************************
Firm-embedded productivity and cross-country income differences
Alviarez, Cravino and Ramondo
Journal of Political Economy (2022)

Program: prep_01_forreg_exclexp.do
Date: October 2022

Description: Constructiong bilateral shares at the level of the firm, sin(phi), excluding exports for a subset of countries with available export information

*****************************************************************************************/


clear all
set more off
local var0 sales /*Key variable of interest*/
local varset sales emp exports va labcost /*all relevant variables*/
local types "naics"


etime, start
foreach type in `types' {

display "`type'"

use year isocode id_bvd guo_bvd hq `varset' sector* NAICS* con_code using "${data}/firm_allyears_sales.dta", clear 
drop if `var0'==. | `var0'==0
tab con_code
drop if con_code=="$lf"
drop con_code
tab sector


*Bring the age of the firm*
*-----------------------------------------------------
gen bvdidnumber=id_bvd
foreach xx in year_1_1990  year_1990_2000 year_2000_2005 year_2005_2010 year_2010_2015 year_2015 {
display "`xx'"
merge m:1 bvdidnumber using "${data}/bvdid_incorportation_`xx'.dta", keepusing(year_unit) update
drop if _merge==2
drop _merge 
}
gen firm_age=year-year_unit
sum firm_age, d
$conditionforage 


*Choosing the industry level 
*-----------------------------------------------------
if "`type'"=="naics4" {
drop if sector=="NA"
tab sector
local industry NAICS4 
}
if "`type'"=="naics3" {
drop if sector=="NA"
tab sector
local industry NAICS3 
}
if "`type'"=="naics" {
drop if sector=="NA"
tab sector
local industry sector 
}
if "`type'"=="market" {
keep if sector1=="Manufacturing (C)" | sector1=="Market_Services (G-H-I-J-K-M-N-R-S-T)" | sector1=="Other_Goods (A-B-D-E-F)" 
drop sector*
gen sector="Market (MARKT)"
gen sector1=sector
local industry sector 
}
if "`type'"=="MS" {
keep if sector1=="Manufacturing (C)" |  sector1=="Market_Services (G-H-I-J-K-M-N-R-S-T)" 
drop sector*
gen sector="MS"
gen sector1=sector
local industry sector 
}
if "`type'"=="manuf" {
keep if sector1=="Manufacturing (C)" 
drop sector*
gen sector="Manufacturing (C)"
gen sector1=sector
local industry sector 
}
if "`type'"=="serv" {
keep if sector1=="Market_Services (G-H-I-J-K-M-N-R-S-T)" 
drop sector*
gen sector="Market_Services (G-H-I-J-K-M-N-R-S-T)"
gen sector1=sector
local industry sector 
}
*

*Collapse at the level of the parent-industry-isocode (this includes the sales of the GUO-isocode at home as well as in other countries)
*-----------------------------------------------------
sort year isocode sector1 sector `industry' hq guo_bvd
collapse (sum) `varset' (max) firm_age, by(year isocode sector1 sector `industry' hq guo_bvd)


*Keep only MNCs in the sample based on the number of countries (across ALL SECTORS) 
*----------------------------------------------------- 
by year guo_bvd isocode, sort: gen a=_n==1
tab a
by year guo_bvd, sort: egen num_iso=total(a)
drop a

gen MNC=0
replace MNC=1 if num_iso>1
tab MNC
tab MNC if isocode!=hq
drop if MNC==0 /*For identification an MNCs need to be in at least 2 foreign countries to be in the sample*/



*Rename each variable of interest in the dataset
*----------------------------------------------------- 
foreach vv of local varset {
display "`vv'"
rename `vv' sin_`vv'
}
sort year hq isocode
keep if sin_`var0'!=. & sin_`var0'!=0 
compress



*Bring aggregate Klems-OECD data --- Xn, Xnn, Xn-exp (this files were prepared in 'aggregate_data_sales.do' files ----
*-----------------------------------------------------
merge m:1 year sector isocode using "${data}/klems_oecd_unido_orbis_sales_emp_exp.dta", keepusing(EMP* GO_usd GO_usd_exp VA_usd exports LAB_usd data_source) 
keep if year>=2005 & year<=2017
drop if _merge==2 
drop _merge

*Rename KLEMS/OECD variables  
rename VA_usd yn_va_ko
rename EMPE yn_emp_ko
rename exports yn_exp_ko
rename GO_usd yn_`var0'_ko
rename LAB_usd yn_labcost_ko
rename GO_usd_exp yn_`var0'_exp_ko
format %9.0fc yn_`var0'_ko yn_`var0'_exp_ko yn_exp_ko yn_va_ko yn_labcost_ko

tempfile t0
save `t0', replace


*Keep only the aggregate data and then bring it to the firm level data
*-----------------------------------------------------
use `t0', clear
keep year isocode sector1 sector `industry' yn_`var0'_ko yn_emp_ko  yn_va_ko yn_`var0'_exp_ko  yn_exp_ko yn_labcost_ko 
duplicates drop 
collapse (sum) yn_`var0'_ko yn_emp_ko yn_va_ko yn_`var0'_exp_ko  yn_exp_ko yn_labcost_ko, by(year isocode sector1 sector `industry')
tempfile t1
save `t1', replace

use `t0', clear
collapse  (sum) sin_sales sin_emp sin_va sin_exp sin_labcost (max) firm_age, by(year guo_bvd isocode hq  sector1 sector `industry')
merge m:1 year isocode sector1 sector `industry' using `t1'
drop _merge

foreach i in sin_sales sin_emp sin_va sin_exp sin_labcost yn_`var0'_ko yn_emp_ko yn_`var0'_exp_ko  yn_exp_ko yn_labcost_ko  {
replace `i'=. if `i'==0
}
tempfile final
save `final', replace


*Construct the relevant shares using aggregate KLEMS/OECD
*-----------------------------------------------------
use `final', clear

display "Drop if revenues are below 100,000 USD" 
sum sin_`var0', d
sum sin_`var0' if sin_`var0'<0.1 
drop if sin_`var0'<0.1 

gen sin_emp_ko=sin_emp/yn_emp_ko
gen sin_va_ko=sin_va/yn_va_ko
gen sin_labcost_ko=sin_labcost/yn_labcost_ko
gen sin_`var0'_ko=sin_`var0'/yn_`var0'_ko
gen sin_`var0'_exp_ko=(sin_`var0'-sin_exp)/yn_`var0'_exp_ko

*Drop observations outside the range (only happens for HK, SG and JP)
drop if sin_sales_ko>=1 | sin_sales_ko<=0 | sin_sales_ko==.
replace sin_emp_ko=. if sin_emp_ko>=1 | sin_emp_ko<=0
replace sin_va_ko=. if sin_va_ko>=1 | sin_va_ko<=0
replace sin_labcost_ko=. if sin_labcost_ko>=1 | sin_labcost_ko<=0
replace sin_`var0'_exp_ko=. if sin_`var0'_exp_ko>=1 | sin_`var0'_exp_ko<=0



*Excluding exports from the denominator for a set of countries (Apppendix F in the paper)
**********************************************************************************************
gen dummy_temp=0
foreach xx in DE EE FR GB GR HR HU SI {
display "`xx'"
replace dummy_temp=1 if isocode=="`xx'"
}

*Note: Substract exports (from numerator and denominator) only when exports data is available. Moreover, the original baseline is adjusted by only using firms with export information for the group of selected countries. 
tab sector1
gen xx=sin_exp 
gen sin_`var0'_exp_ko_alt=sin_`var0'_ko
replace sin_`var0'_exp_ko_alt=(sin_`var0'-xx)/yn_`var0'_exp_ko if sector1=="Manufacturing (C)" & dummy_temp==1
drop xx

*Note: for the adjusted baseline
replace sin_`var0'_ko=. if sin_`var0'_exp_ko_alt==. & sector1=="Manufacturing (C)" & dummy_temp==1
drop dummy_temp
**********************************************************************************************


*Keep only MNC (two countries, which includes two or more affiliates) in a given sector -- notice this is more demanding that the applied restrictions above, but necessary for the sectoral level regressions
*----------------------------------------------
by year sector guo_bvd isocode, sort: gen a=_n==1
by year sector guo_bvd, sort: egen b=total(a)
tab b 
keep if b>=2
drop a b 
format %9.4fc sin* 
format %9.0fc *`var0' yn* 
sort sector1 sector `industry' hq isocode
order sector1 sector `industry' hq isocode sin_`var0'_ko sin_`var0'_exp_ko 



*Create percentiles 
*----------------------------------------------
foreach pos in 10 20 30 40 50 60 70 80 90 {
display `pos'
gsort year isocode sector1 sector `industry' -sin_sales
by year isocode sector1 sector `industry': egen p`pos'_sales = pctile(sin_sales), p(`pos')
format %9.3fc p`pos'_sales
label var p`pos'_sales "pctile `pos' of sin_sales"
}
*


*Number of countries in which the affiliate operates (outside home)
*----------------------------------------------
sort year sector1 `industry' hq guo_bvd isocode
by year sector1 `industry' hq guo_bvd isocode, sort: gen a=_n==1
replace a=0 if isocode==hq
by year sector1 `industry' hq guo_bvd, sort: egen num_ctry=total(a)
drop a
label var num_ctry "number of foreign countries GUO operates within `type'"

tempfile LCS
save `LCS', replace


*Firms in the Largest Connected Set 
*----------------------------------------------
use `LCS', clear
keep year `industry' guo_bvd isocode iso sin_`var0' sin_`var0'_ko yn_`var0'_ko
duplicates drop
sort year isocode guo_bvd
egen iso=group(isocode)
egen guo=group(guo_bvd)
egen sss=group(`industry')
order year `industry' guo_bvd isocode sss guo iso sin_`var0' sin_`var0'_ko yn_`var0'_ko
sort year sss iso guo 
keep if isocode!=""
keep if isocode=="JP" | isocode=="IT" | isocode=="KR" | isocode=="DE" | isocode=="FR" | isocode=="ES" |  isocode=="MX" |  isocode=="GB" |  isocode=="PL" |  isocode=="GR" |  isocode=="NL" |  isocode=="RO" |  isocode=="BE" |  isocode=="AT" |  isocode=="PT" |  isocode=="FI" |  isocode=="SE" |  isocode=="CZ" |  isocode=="SK" |  isocode=="BG" |  isocode=="HU" |  isocode=="HR" |  isocode=="LV" |  isocode=="SI" |  isocode=="LT" |  isocode=="EE" 
save "${output}/LCS_`type'.dta", replace 
keep year sss iso guo sin_`var0' sin_`var0'_ko yn_`var0'_ko
sort year sss iso guo 
order year sss guo iso sin_`var0' sin_`var0'_ko yn_`var0'_ko 
export delimited using "${output}/LCS_`type'", novarnames replace



*Labeling variables of interest 
*----------------------------------------------
use `LCS', clear
label var sin_emp_ko "sin_emp/yn_emp (den KLEMS/OECD)"
label var sin_va_ko "sin_va/yn_va (den KLEMS/OECD)"
label var sin_`var0'_ko "sin_`var0'/yn (den KLEMS/OECD)"
label var sin_`var0'_exp_ko "(sin_`var0'-sin_exp)/yn_`var0'_exp_ko (den KLEMS/OECD/WIOT)"
label var sin_`var0' "`var0' of i MNCs operating in n (cond operating in i)"
label var sin_emp "emp of i MNCs operating in n (cond operating in i)"
label var sin_exp "exports of i MNCs operating in n (cond operating in i)"
label var sin_va "value added of i MNCs operating in n (cond operating in i)"
label var sin_labcost "Labor cost of i MNCs operating in n (cond operating in i)"

label var yn_`var0'_ko "total `var0' in country n"
label var yn_emp_ko "total emp in country n"
label var yn_va_ko "total va in country n"
label var yn_labcost_ko "total labor cost in country n"
label var yn_exp_ko "total exports in country n"
label var yn_`var0'_exp_ko "total `var0' (minus exports) in country n"

compress

*Saving the file
*----------------------------------------------
drop if year==.
order year sector1 sector `industry' hq isocode sin_`var0'_ko sin_`var0'_exp_ko 
compress
tempfile sin_firmlevel_`type'
save `sin_firmlevel_`type'', replace
}
etime 



*Check for outliers
*----------------------------------------------
display "Drop foreing affiliates below 1MM USD and Parents below 5MM USD"
clear all
local whattype naics
use `sin_firmlevel_`whattype'', clear

sum sin_`var0', d
sum sin_`var0' if sin_`var0'<1 
tab isocode if sin_`var0'>=1
display "Drop firms below 1 MILLION USD" 

duplicates drop 
compress 
tempfile sample_firms
save `sample_firms', replace


*===============================================================================
*Temporary check on gravity variables
*===============================================================================
use "${data}/gravdata_00_16.dta", clear
keep if year==2016
replace year=2017
tempfile temp
save `temp', replace

use "${data}/gravdata_00_16.dta", clear
append using `temp'
tempfile gravdata_00_17
save `gravdata_00_17', replace



*Additions: include gravity variables and a dummy of constant MNC across years  
*----------------------------------------------
clear all
set more off

etime, start 
foreach type in `types' {

display "`type'"

if "`type'"=="naics4" {
local industry NAICS4 
}
if "`type'"=="naics3" {
local industry NAICS3 
}
else {
local industry sector
}

*Bring gravity variables 
*-----------------------------------------------
use `sin_firmlevel_`type'', clear

gen iso2_o=hq 
gen iso2_d=isocode
local trade_var distw fta_wto contig comlang_off colony  pop_o pop_d gdp_o gdp_d gdpcap_o gdpcap_d   
merge m:1 iso2_o iso2_d year using `gravdata_00_17', keepusing(`trade_var')

drop if _merge==2
drop _merge
foreach var in iso2_o iso2_d {
encode `var', gen(`var'b)
drop `var'
rename `var'b `var'
}
*
gen distance=ln(distw)
drop distw
drop iso2_o iso2_d
rename fta_wto RTA
label var distance "ln distance" 
label var RTA "Regional Trade Aggrement" 
label var comlang_off "Common Language" 
label var contig "commnon Border" 
label var colony "Former Colony" 
compress


*Flag MNCs number of years it has information (out of yy years)
*-----------------------------------------------
by sector1 sector guo_bvd isocode year, sort: gen a=_n==1
replace a=0 if year<2010
by sector1 sector guo_bvd isocode, sort: egen years_after2010=total(a)
drop a
by sector1 sector guo_bvd isocode year, sort: gen a=_n==1
by sector1 sector guo_bvd isocode, sort: egen years_allperiod=total(a)
drop a
gen sample_const=1
compress
save  "${data}/sin_firmlevel_`type'_gravity${lf}_exclexp.dta", replace
}
etime

*log close 

